{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Audio Feature Augmentation\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# When running this tutorial in Google Colab, install the required packages\n# with the following.\n# !pip install torchaudio librosa\n\nimport torch\nimport torchaudio\nimport torchaudio.transforms as T\n\nprint(torch.__version__)\nprint(torchaudio.__version__)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Preparing data and utility functions (skip this section)\n\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# @title Prepare data and utility functions. {display-mode: \"form\"}\n# @markdown\n# @markdown You do not need to look into this cell.\n# @markdown Just execute once and you are good to go.\n# @markdown\n# @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/),\n# @markdown which is licensed under Creative Commos BY 4.0.\n\n# -------------------------------------------------------------------------------\n# Preparation of data and helper functions.\n# -------------------------------------------------------------------------------\n\nimport os\n\nimport librosa\nimport matplotlib.pyplot as plt\nimport requests\n\n\n_SAMPLE_DIR = \"_assets\"\n\nSAMPLE_WAV_SPEECH_URL = \"https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav\"  # noqa: E501\nSAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, \"speech.wav\")\n\nos.makedirs(_SAMPLE_DIR, exist_ok=True)\n\n\ndef _fetch_data():\n    uri = [\n        (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH),\n    ]\n    for url, path in uri:\n        with open(path, \"wb\") as file_:\n            file_.write(requests.get(url).content)\n\n\n_fetch_data()\n\n\ndef _get_sample(path, resample=None):\n    effects = [[\"remix\", \"1\"]]\n    if resample:\n        effects.extend(\n            [\n                [\"lowpass\", f\"{resample // 2}\"],\n                [\"rate\", f\"{resample}\"],\n            ]\n        )\n    return torchaudio.sox_effects.apply_effects_file(path, effects=effects)\n\n\ndef get_speech_sample(*, resample=None):\n    return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample)\n\n\ndef get_spectrogram(\n    n_fft=400,\n    win_len=None,\n    hop_len=None,\n    power=2.0,\n):\n    waveform, _ = get_speech_sample()\n    spectrogram = T.Spectrogram(\n        n_fft=n_fft,\n        win_length=win_len,\n        hop_length=hop_len,\n        center=True,\n        pad_mode=\"reflect\",\n        power=power,\n    )\n    return spectrogram(waveform)\n\n\ndef plot_spectrogram(spec, title=None, ylabel=\"freq_bin\", aspect=\"auto\", xmax=None):\n    fig, axs = plt.subplots(1, 1)\n    axs.set_title(title or \"Spectrogram (db)\")\n    axs.set_ylabel(ylabel)\n    axs.set_xlabel(\"frame\")\n    im = axs.imshow(librosa.power_to_db(spec), origin=\"lower\", aspect=aspect)\n    if xmax:\n        axs.set_xlim((0, xmax))\n    fig.colorbar(im, ax=axs)\n    plt.show(block=False)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## SpecAugment\n\n[SpecAugment](https://ai.googleblog.com/2019/04/specaugment-new-data-augmentation.html)_\nis a popular spectrogram augmentation technique.\n\n``torchaudio`` implements :py:func:`torchaudio.transforms.TimeStretch`,\n:py:func:`torchaudio.transforms.TimeMasking` and\n:py:func:`torchaudio.transforms.FrequencyMasking`.\n\n\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## TimeStretch\n\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "spec = get_spectrogram(power=None)\nstretch = T.TimeStretch()\n\nrate = 1.2\nspec_ = stretch(spec, rate)\nplot_spectrogram(torch.abs(spec_[0]), title=f\"Stretched x{rate}\", aspect=\"equal\", xmax=304)\n\nplot_spectrogram(torch.abs(spec[0]), title=\"Original\", aspect=\"equal\", xmax=304)\n\nrate = 0.9\nspec_ = stretch(spec, rate)\nplot_spectrogram(torch.abs(spec_[0]), title=f\"Stretched x{rate}\", aspect=\"equal\", xmax=304)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## TimeMasking\n\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "torch.random.manual_seed(4)\n\nspec = get_spectrogram()\nplot_spectrogram(spec[0], title=\"Original\")\n\nmasking = T.TimeMasking(time_mask_param=80)\nspec = masking(spec)\n\nplot_spectrogram(spec[0], title=\"Masked along time axis\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## FrequencyMasking\n\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "torch.random.manual_seed(4)\n\nspec = get_spectrogram()\nplot_spectrogram(spec[0], title=\"Original\")\n\nmasking = T.FrequencyMasking(freq_mask_param=80)\nspec = masking(spec)\n\nplot_spectrogram(spec[0], title=\"Masked along frequency axis\")"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.4"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}